*-------------------------------------------------------------------------------
* Objective: Create Merchandise Export % in GDP Data
*-------------------------------------------------------------------------------

********************************************************************************
**# Step 1.1: Load and format Total Merchanidse Export Value
/*******************************************************************************
Source: World Bank WDI
Notes: We take 0s as missing values
       Data accessed 12.03.2020
*******************************************************************************/

clear all
import excel "raw_datasets/merch_x.xlsx", sheet("Data") firstrow clear

* Genereting missing values
quietly{
	foreach var of varlist y1960-y2019{
	replace `var' = "" if `var' == ".." | `var' == "0"
	}
	destring y*, replace
}


* Matching variable name of country code with the one used in the main dataset
rename CountryCode code_wb

* Generate 2020 based on 2019
rename y2019 y2020

* Reshaping into long shape and renaming variable
reshape long y, i(code_wb) j(year)
rename y merch_x

* Keep only desired countries
merge m:1 code_wb using "processed_datasets/ccode", keepusing(country_wb)
keep if _merge == 3

* Cleaning
drop _merge country_wb CountryName SeriesName SeriesCode 

* Save dataset
save "processed_datasets/dataset_merchgdp", replace

********************************************************************************
**# Step 1.2: Load and format GDP
/*******************************************************************************
Source: World Bank WDI (NO BETA)
Notes: We take 0s as missing values
       Data accessed 12.03.2020
*******************************************************************************/

clear all
import excel "raw_datasets/gdp.xlsx", sheet("Data") firstrow clear

quietly{
	foreach var of varlist y1960-y2019{
	replace `var' = "" if `var' == ".." | `var' == "0"
	}
	destring y*, replace
}

* Format for consistency
rename CountryCode code_wb
rename y2019 y2020
reshape long y, i(code_wb) j(year)
rename y gdp

* Keep only desired countries
merge m:1 code_wb using "processed_datasets/ccode", keepusing(country_wb)
keep if _merge == 3
drop _merge country_wb CountryName SeriesName SeriesCode 

* Merge with main_dataset
tempfile gdpdata
save `gdpdata', replace
clear all
use "processed_datasets/dataset_merchgdp", clear
merge 1:1 code_wb year using `gdpdata'
drop _merge
save "processed_datasets/dataset_merchgdp", replace

* We drop South Sudan now and modify Sudan later to construct Former Sudan
drop if code_wb == "SSD"

/*******************************************************************************
* Creating merchandise exports share in GDP
*******************************************************************************/
gen merch_sh_gdp = (merch_x/gdp)*100

********************************************************************************
**# Step 2: Missing Data
/*******************************************************************************
We have estimated the variable 'merch_sh_gdp' using WDI data. For 1960-2005 we 
will later replace this variable with data from GJV2016. For post 2010 
observations we will complement WB WDI data with data from FAO. But first
we try to improve 'merch_sh_gdp' as best as possible.
*******************************************************************************/

********************************************************************************
**# Step 2.1: Missing Data - Construct Former Sudan
/*******************************************************************************
We take 2011 as the last year in which data reflects Former Sudan
We take Merchandise X values from FAO and GDP values from WDI
Because FAO data only goes until 2018, we leave the shares as missing for 2020.
In the case of South Sudan WDI data does not contain GDP values after 2015, so 
we take 2015 values for 2016, 2017, 2018. FAO data does not contain Merch X 
values before 2014, so we take 2014 values for 2012 and 2013.
*******************************************************************************/

replace merch_sh_gdp = 13.10981296 if code_wb == "SDN" & year == 2012
replace merch_sh_gdp = 15.10147209 if code_wb == "SDN" & year == 2013
replace merch_sh_gdp = 11.108199   if code_wb == "SDN" & year == 2014
replace merch_sh_gdp = 6.383176465 if code_wb == "SDN" & year == 2015
replace merch_sh_gdp = 7.172152446 if code_wb == "SDN" & year == 2016
replace merch_sh_gdp = 9.374754246 if code_wb == "SDN" & year == 2017
replace merch_sh_gdp = 13.51610677 if code_wb == "SDN" & year == 2018
replace merch_sh_gdp = .           if code_wb == "SDN" & year == 2020

********************************************************************************
**# Step 2.2: Missing Data - Approximating 2020
********************************************************************************

* Missings close to 2020 are fixed by using the closest value between 2016-2018
sort code_wb year
gen identi = 0
by code_wb: replace identi = 1 if merch_sh_gdp != . & merch_sh_gdp[60] == . & year == 2018
by code_wb: replace merch_sh_gdp = merch_sh_gdp[59] if merch_sh_gdp == . & year == 2020
by code_wb: replace identi = 1 if merch_sh_gdp != . & merch_sh_gdp[60] == . & year == 2017
by code_wb: replace merch_sh_gdp = merch_sh_gdp[58] if merch_sh_gdp == . & year == 2020
by code_wb: replace identi = 1 if merch_sh_gdp != . & merch_sh_gdp[60] == . & year == 2016
by code_wb: replace merch_sh_gdp = merch_sh_gdp[57] if merch_sh_gdp == . & year == 2020
replace merch_sh_gdp = . if identi == 1
drop identi

********************************************************************************
**# Step 2.3: Missing Data - Applying Interpolation
********************************************************************************

sort code_wb year
by code_wb: ipolate merch_sh_gdp year, gen(intmgdp)
replace merch_sh_gdp = intmgdp if merch_sh_gdp == .
drop intmgdp

*-------------------------------------------------------------------------------
* We use data from GJV2016 to fix missings and aid in interpolation later

rename code_wb ccode
replace ccode = "ZAR" if ccode == "COD"
merge 1:1 ccode year using "raw_datasets/exports_gjv", keepusing(merch_x_gdp) 
replace ccode = "COD" if ccode == "ZAR"
rename ccode code_wb
replace merch_sh_gdp = merch_x_gdp if merch_sh_gdp == .
sort code_wb year

********************************************************************************
**# Step 2.3: Missing Data - Specific Cases
********************************************************************************

* Syria - We use Merch. X from WDI data. GDP comes from FED Syrian GDP estimates.
* We take 2017 as 2020

replace merch_sh_gdp = 7.6 if year == 2015 & code_wb == "SYR"
replace merch_sh_gdp = 8.8 if year == 2020 & code_wb == "SYR"

* Taiwan - We use Merch. X from FAO data and GDP from Taiwan official statistics
* We take 2018 as 2020
 
replace merch_sh_gdp = 0.6370 * 100 if year == 2011 & code_wb == "TWN"
replace merch_sh_gdp = 0.6078 * 100 if year == 2012 & code_wb == "TWN"
replace merch_sh_gdp = 0.5955 * 100 if year == 2013 & code_wb == "TWN"
replace merch_sh_gdp = 0.5979 * 100 if year == 2014 & code_wb == "TWN"
replace merch_sh_gdp = 0.5339 * 100 if year == 2015 & code_wb == "TWN"
replace merch_sh_gdp = 0.5162 * 100 if year == 2016 & code_wb == "TWN"
replace merch_sh_gdp = 0.5370 * 100 if year == 2017 & code_wb == "TWN"
replace merch_sh_gdp = 0.5513 * 100 if year == 2020 & code_wb == "TWN"

* Venezuela - We use Merch. X from WB data and GDP from IMF World Econ. Outlook
* We take 2018 as 2020

replace merch_sh_gdp = 0.114632346 * 100 if year == 2015 & code_wb == "VEN"
replace merch_sh_gdp = 0.092224411 * 100 if year == 2016 & code_wb == "VEN"
replace merch_sh_gdp = 0.223581758 * 100 if year == 2017 & code_wb == "VEN"
replace merch_sh_gdp = 0.344955654 * 100 if year == 2018 & code_wb == "VEN"
replace merch_sh_gdp = 0.257270169 * 100 if year == 2020 & code_wb == "VEN"

* Eritrea - We use Merch. X from WB data and GDP from UN DATA (GDP by Type of Expenditure at current prices - US dollars)
* We take 2018 as 2020

replace merch_sh_gdp = 9.8 if year == 2020 & code_wb == "ERI"
replace merch_sh_gdp = . if year == 2018 & code_wb == "ERI"
replace merch_sh_gdp = 9.8 if year == 2017 & code_wb == "ERI"
replace merch_sh_gdp = 8.7 if year == 2016 & code_wb == "ERI"
replace merch_sh_gdp = 11.2 if year == 2015 & code_wb == "ERI"
replace merch_sh_gdp = 15.9 if year == 2014 & code_wb == "ERI"
replace merch_sh_gdp = 9 if year == 2013 & code_wb == "ERI"
replace merch_sh_gdp = 15.7 if year == 2012 & code_wb == "ERI"
replace merch_sh_gdp = 16.5 if year == 2011 & code_wb == "ERI"

* Somalia - We use Merch. X from IMF DOTS and GDP from UN DATA (GDP by Type of Expenditure at current prices - US dollars)
* We take 2018 as 2020

replace merch_sh_gdp = 35.9 if year == 2020 & code_wb == "SOM"
replace merch_sh_gdp = . if year == 2018 & code_wb == "SOM"
replace merch_sh_gdp = 31.0 if year == 2017 & code_wb == "SOM"
replace merch_sh_gdp = 43.2 if year == 2016 & code_wb == "SOM"
replace merch_sh_gdp = 46.6 if year == 2015 & code_wb == "SOM"
replace merch_sh_gdp = 40.6 if year == 2014 & code_wb == "SOM"
replace merch_sh_gdp = 35.8 if year == 2013 & code_wb == "SOM"
replace merch_sh_gdp = 30.5 if year == 2012 & code_wb == "SOM"
replace merch_sh_gdp = 41.9 if year == 2011 & code_wb == "SOM"

*-------------------------------------------------------------------------------
* Filling with interpolation
by code_wb: ipolate merch_sh_gdp year, gen(intmgdp)
replace merch_sh_gdp = intmgdp if merch_sh_gdp == .
drop intmgdp

********************************************************************************
**# Step 3: Calculate moving averages
********************************************************************************

* Sorting
sort code_wb year

* Moving Average (-1/+1)

* Generating Variable ma1
gen merch_sh_gdp_ma1 = 0

* Estimating ma1 for the sample except 1960 and 2020
foreach ma1 of numlist 2/59{
	by code_wb: replace merch_sh_gdp_ma1 = (merch_sh_gdp[`ma1'-1] + merch_sh_gdp[`ma1'] + merch_sh_gdp[`ma1'+1])/3 if _n == `ma1' 
}

* Estimating ma1 for 1960 and 2020
by code_wb: replace merch_sh_gdp_ma1 = (merch_sh_gdp[1] + merch_sh_gdp[2])/2 if _n == 1
by code_wb: replace merch_sh_gdp_ma1 = (merch_sh_gdp[59] + merch_sh_gdp[60])/2 if _n == 60

* Taking moving averages with a window of 2

* Generating Variable ma2
gen merch_sh_gdp_ma2 = 0
foreach ma2 of numlist 3/58{
	by code_wb: replace merch_sh_gdp_ma2 = ///
	(merch_sh_gdp[`ma2'-2] + merch_sh_gdp[`ma2'-1] + merch_sh_gdp[`ma2'] + merch_sh_gdp[`ma2'+1] + merch_sh_gdp[`ma2'+2])/5 ///
	if _n == `ma2' 
}
	
* Estimating ma2 for 1960, 1962, 2017 and 2020
by code_wb: replace merch_sh_gdp_ma2 = (merch_sh_gdp[1] + merch_sh_gdp[2] + merch_sh_gdp[3])/3 if _n == 1
by code_wb: replace merch_sh_gdp_ma2 = (merch_sh_gdp[1] + merch_sh_gdp[2] + merch_sh_gdp[3] + merch_sh_gdp[4])/4 if _n == 2
by code_wb: replace merch_sh_gdp_ma2 = (merch_sh_gdp[57] + merch_sh_gdp[58] + merch_sh_gdp[59] + merch_sh_gdp[60])/4 if _n == 59
by code_wb: replace merch_sh_gdp_ma2 = (merch_sh_gdp[58] + merch_sh_gdp[59] + merch_sh_gdp[60])/3 if _n == 60

********************************************************************************
**# Step 4: Improving data
/*******************************************************************************
We now calculate the share of merch. exports in gdp with an alternative source
for total merchandise exports. We use instead FAO as a source. Merchandise
exports from FAO data had already been estimated in the do file 
'natres_data_creator.do'. We still use World Bank data as a source for GDP. The
resulting variable ('merch_sh_gdp_fao') will be used to correct values of the
already estimated 'merch_sh_gdp' that seem odd based on drastic changes 
on its value when constrated to the rest of the series of the country. 
*******************************************************************************/

* Import Data
drop _merge
merge 1:1 code_wb year using "processed_datasets/dataset_agriexp.dta", keepusing(merchfao_x)

/*******************************************************************************
We create the variable gdp_alt which is the same as gdp but uses 2018 as 
approximation to 2020 instead of 2019. This is necessary given that FAO uses 
2018 as approximation to 2020 and we need to keep consistency when creating the 
variable merch_sh_gdp_fao.
*******************************************************************************/

sort code_wb year
gen double gdp_alt = gdp
by code_wb: replace gdp_alt = gdp_alt[59] if _n == 60
replace gdp_alt = . if year == 2018

* Create share of Merch X in GDP using FAO data for Merch X
gen merch_sh_gdp_fao = (merchfao_x*1000/gdp_alt)*100

*-------------------------------------------------------------------------------
* Updating 2020

sort code_wb year
gen identi = 0
by code_wb: replace identi = 1 if merch_sh_gdp_fao != . & merch_sh_gdp_fao[60] == . & year == 2018
by code_wb: replace merch_sh_gdp_fao = merch_sh_gdp_fao[59] if merch_sh_gdp_fao == . & year == 2020
by code_wb: replace identi = 1 if merch_sh_gdp_fao != . & merch_sh_gdp_fao[60] == . & year == 2017
by code_wb: replace merch_sh_gdp_fao = merch_sh_gdp_fao[58] if merch_sh_gdp_fao == . & year == 2020
by code_wb: replace identi = 1 if merch_sh_gdp_fao != . & merch_sh_gdp_fao[60] == . & year == 2016
by code_wb: replace merch_sh_gdp_fao = merch_sh_gdp_fao[57] if merch_sh_gdp_fao == . & year == 2020
replace merch_sh_gdp_fao = . if identi == 1
drop identi

*-------------------------------------------------------------------------------
* Filling with interpolation

sort code_wb year
by code_wb: ipolate merch_sh_gdp_fao year, gen(intmerch_sh_gdp_fao)
replace merch_sh_gdp_fao = intmerch_sh_gdp_fao if merch_sh_gdp_fao == .
drop intmerch_sh_gdp_fao

*-------------------------------------------------------------------------------
* Replace missings with GJV2016

replace merch_sh_gdp_fao = merch_x_gdp if merch_sh_gdp_fao == .

*-------------------------------------------------------------------------------
* Filling with interpolation

sort code_wb year
by code_wb: ipolate merch_sh_gdp_fao year, gen(intmerch_sh_gdp_fao)
replace merch_sh_gdp_fao = intmerch_sh_gdp_fao if merch_sh_gdp_fao == .
drop intmerch_sh_gdp_fao

*-------------------------------------------------------------------------------
* Estimating Moving Averages

* Sorting
sort code_wb year

* Moving Average (-1/+1)

* Generating Variable ma1
gen merch_sh_gdp_fao_ma1 = 0

* Estimating ma1 for the sample except 1960 and 2020
foreach ma1 of numlist 2/59{
	by code_wb: replace merch_sh_gdp_fao_ma1 = (merch_sh_gdp_fao[`ma1'-1] + merch_sh_gdp_fao[`ma1'] + merch_sh_gdp_fao[`ma1'+1])/3 if _n == `ma1' 
}

* Estimating ma1 for 1960 and 2020
by code_wb: replace merch_sh_gdp_fao_ma1 = (merch_sh_gdp_fao[1] + merch_sh_gdp_fao[2])/2 if _n == 1
by code_wb: replace merch_sh_gdp_fao_ma1 = (merch_sh_gdp_fao[59] + merch_sh_gdp_fao[60])/2 if _n == 60

* Taking moving averages with a window of 2

* Generating Variable ma2
gen merch_sh_gdp_fao_ma2 = 0

foreach ma2 of numlist 3/58{
	by code_wb: replace merch_sh_gdp_fao_ma2 = ///
	(merch_sh_gdp_fao[`ma2'-2] + merch_sh_gdp_fao[`ma2'-1] + merch_sh_gdp_fao[`ma2'] + merch_sh_gdp_fao[`ma2'+1] + merch_sh_gdp_fao[`ma2'+2])/5 ///
	if _n == `ma2' 
}
	
* Estimating ma2 for 1960, 1962, 2017 and 2020
by code_wb: replace merch_sh_gdp_fao_ma2 = (merch_sh_gdp_fao[1] + merch_sh_gdp_fao[2] + merch_sh_gdp_fao[3])/3 if _n == 1
by code_wb: replace merch_sh_gdp_fao_ma2 = (merch_sh_gdp_fao[1] + merch_sh_gdp_fao[2] + merch_sh_gdp_fao[3] + merch_sh_gdp_fao[4])/4 if _n == 2
by code_wb: replace merch_sh_gdp_fao_ma2 = (merch_sh_gdp_fao[57] + merch_sh_gdp_fao[58] + merch_sh_gdp_fao[59] + merch_sh_gdp_fao[60])/4 if _n == 59
by code_wb: replace merch_sh_gdp_fao_ma2 = (merch_sh_gdp_fao[58] + merch_sh_gdp_fao[59] + merch_sh_gdp_fao[60])/3 if _n == 60

********************************************************************************
**# Step 4.1: Improving data - Combining all sources for merch_sh_gdp
********************************************************************************

keep if year == 1960 | year == 1965 | year == 1970 | year == 1975 | year == 1980 | year == 1985 | year == 1990 | year == 1995 ///
| year == 2000 | year == 2005 | year == 2010 | year == 2015 | year == 2020

********************************************************************************
*Replace merch_sh_gdp values pre-2010 with GJV2016 data
********************************************************************************
gen merch_sh_gdp_c = merch_x_gdp
replace merch_sh_gdp_c = merch_sh_gdp if year > 2005
gen merch_sh_gdp_c_ma1 = merch_x_gdp
replace merch_sh_gdp_c_ma1 = merch_sh_gdp_ma1 if year > 2005
gen merch_sh_gdp_c_ma2 = merch_x_gdp
replace merch_sh_gdp_c_ma2 = merch_sh_gdp_ma2 if year > 2005

********************************************************************************
* Modify variables odd values with merch_sh_gdp_fao
********************************************************************************

replace merch_sh_gdp_c = merch_sh_gdp_fao if year > 2011 & code_wb == "DJI"
replace merch_sh_gdp_c_ma1 = merch_sh_gdp_fao_ma1 if year > 2011 & code_wb == "DJI"
replace merch_sh_gdp_c_ma2 = merch_sh_gdp_fao_ma2 if year > 2011 & code_wb == "DJI"

replace merch_sh_gdp_c = merch_sh_gdp_fao if year > 2016 & code_wb == "VNM"
replace merch_sh_gdp_c_ma1 = merch_sh_gdp_fao_ma1 if year > 2016 & code_wb == "VNM"
replace merch_sh_gdp_c_ma2 = merch_sh_gdp_fao_ma2 if year > 2016 & code_wb == "VNM"

********************************************************************************
* Improving merch_sh_gdp for countries with bad/odd data
********************************************************************************

*Hong Kong: Source: (https://oec.world/en/profile/country/hkg?tradeScaleSelector1=tradeScale0&yearSelector1=exportGrowthYear16) - And 2010 from GJV2016

replace merch_sh_gdp_c = 53.6 if year == 2010 & code_wb == "HKG"
replace merch_sh_gdp_c = 34 if year == 2015 & code_wb == "HKG"
replace merch_sh_gdp_c = 34 if year == 2020 & code_wb == "HKG"

replace merch_sh_gdp_c_ma1 = 53.6 if year == 2010 & code_wb == "HKG"
replace merch_sh_gdp_c_ma1 = 34 if year == 2015 & code_wb == "HKG"
replace merch_sh_gdp_c_ma1 = 34 if year == 2020 & code_wb == "HKG"

replace merch_sh_gdp_c_ma2 = 53.6 if year == 2010 & code_wb == "HKG"
replace merch_sh_gdp_c_ma2 = 34 if year == 2015 & code_wb == "HKG"
replace merch_sh_gdp_c_ma2 = 34 if year == 2020 & code_wb == "HKG"

*Singapore: Source: Using share of re-exports of 2012 from Singapore official statistics - 2010 from GJV2016

replace merch_sh_gdp_c = 47.3 if year == 2010 & code_wb == "SGP"
replace merch_sh_gdp_c = 64 if year == 2015 & code_wb == "SGP"
replace merch_sh_gdp_c = 59 if year == 2020 & code_wb == "SGP"

replace merch_sh_gdp_c_ma1 = 47.3 if year == 2010 & code_wb == "SGP"
replace merch_sh_gdp_c_ma1 = 64 if year == 2015 & code_wb == "SGP"
replace merch_sh_gdp_c_ma1 = 59 if year == 2020 & code_wb == "SGP"

replace merch_sh_gdp_c_ma2 = 47.3 if year == 2010 & code_wb == "SGP"
replace merch_sh_gdp_c_ma2 = 64 if year == 2015 & code_wb == "SGP"
replace merch_sh_gdp_c_ma2 = 59 if year == 2020 & code_wb == "SGP"

********************************************************************************
**# Step 5: Finalize dataset
********************************************************************************

* Keeping desired variables
keep code_wb year merch_sh_gdp_c merch_sh_gdp_c_ma1 merch_sh_gdp_c_ma2

* Save dataset
save "processed_datasets/dataset_merchgdp", replace
